##
## Material para replicação do Anexo 3 - LDA
## Título: Gênero, dinâmicas de poder intrapartidárias e manterrupting no Legislativo
## Autores: Mauricio Izumi e Debora Thome
##

# Working directory: the relative file paths used below are resolved against it.
# Every backslash of a Windows path must be doubled inside an R string literal;
# a single backslash before "pc" forms an invalid escape sequence and prevents
# the whole script from being parsed.
setwd("C:\\Users\\Mauricio\\Desktop\\pc\\pesquisas\\discursos_sf\\replication\\anexo03")

#carrega pacotes
library(dplyr)
library(data.table)
library(tm)
library(topicmodels)
library(tidytext)

#carrega funções
# Concatenate sentences into a single string.
#
# x: vector of sentences (coerced to character).
# Returns a length-one character vector in which every element of `x` is
# preceded by one space (e.g. c("a", "b") -> " a b"); the leading space is kept
# for compatibility with the previous loop-based implementation.
# Empty input returns "" (an index loop over 1:length(x) would walk c(1, 0)
# and produce " NA " instead).
concat <- function(x) {
  # Prepending "" makes `collapse` emit the leading separator.
  paste(c("", as.character(x)), collapse = " ")
}

# Concatenate terms into a single comma-separated string.
#
# x: vector of terms (coerced to character).
# Returns a length-one character vector in which every element of `x` is
# preceded by ", " (e.g. c("a", "b") -> ", a, b"); callers are expected to
# strip the leading ", " themselves, as the previous loop-based
# implementation required.
# Empty input returns "" (an index loop over 1:length(x) would walk c(1, 0)
# and produce ", NA, " instead).
concat2 <- function(x) {
  # Prepending "" makes `collapse` emit the leading separator.
  paste(c("", as.character(x)), collapse = ", ")
}

# Load the data: reads the gzipped CSV into `tab`.
# The code below reads the columns id_disc, dummy_autor and fala_limpa from it.
# NOTE(review): the file name suggests the text is already stemmed/cleaned -
# confirm against the step that produced the file.
tab <- fread("discursos_stem_final.csv.gz")

# Remove speeches with empty sentences (currently disabled).
# The ids listed below look like pasted console output of the affected
# "id_disc-dummy_autor" keys:
#[1] 423958-0 376410-0 382019-0 360609-0 373068-0 405076-0 390572-0 392743-0
#[9] 425693-0
#
#tab <- subset(tab, !(tab$id_disc %in% c(423958,376410,382019,360609,373068,405076,390572,392743,425693)))

#########################

##
## Processamento dos dados
##

# Group the data by id_disc and dummy_autor.
# Goal: put together the utterances of the author and those of the
# interjecting speakers ("aparteantes") of the same speech, so that each
# id_disc/dummy_autor combination becomes one document.
tab$id <- paste(tab$id_disc, tab$dummy_autor, sep = "-")

# Document keys, in order of first appearance in `tab`.
id <- unique(tab$id)

# Split the text once by key instead of re-scanning the whole table for every
# key; fixing the factor levels to `id` keeps the groups in the same order as
# `id`, and split() preserves the row order inside each group.
falas_por_id <- split(tab$fala_limpa, factor(tab$id, levels = id))

# One concatenated string per document; vapply() guarantees a character
# vector of the right length even when there are no documents at all.
fala_sen <- vapply(falas_por_id, concat, character(1), USE.NAMES = FALSE)

dados <- data.frame(id, fala_sen)

######################################

# Build the corpus: one document per row of `dados`.
corpus <- VCorpus(VectorSource(dados$fala_sen))

# Build the document-term matrix, keeping only the terms whose document
# frequency lies inside the global bounds defined below.
ndocs <- length(corpus)
# Lower bound: ignore sparse terms (appearing in less than 1% of the docs).
minDocFreq <- 0.01 * ndocs
# Upper bound: ignore common terms (appearing in more than 99% of the docs).
maxDocFreq <- 0.99 * ndocs
dtm_control <- list(bounds = list(global = c(minDocFreq, maxDocFreq)))
dtm <- DocumentTermMatrix(corpus, control = dtm_control)

#verifica falas que ficaram com zero termos após a limpeza
#Preciso excluir os discursos em que isso ocorre e refazer o processo!!!
#teste <- as.matrix(dtm)
#temp <- apply(teste, 1, sum)
#dados[which(temp == 0),1]

#######################

##
## Latent Dirichlet allocation
##

# Number of topics to fit.
ntopics <- 37
# Control list for the fit; only the seed is set here.
lda_control <- list(seed = 1234)
result <- LDA(dtm, k = ntopics, control = lda_control)

# Word-topic probabilities.
# For each topic/term combination, `beta` is the probability estimated by the
# model of that term being generated from that topic. For example, the first
# row gives the probability of its term being generated from its topic.
res_topics <- tidy(result, matrix = "beta")

# The 10 terms with the highest beta within each topic, sorted by topic and
# then by decreasing beta. top_n() keeps ties, so a topic can contribute more
# than 10 rows if several terms share the cut-off value.
res_top_terms <- res_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  arrange(topic, -beta)

# One comma-separated string of top terms per topic, built in a single
# vectorised pass (no growing vector, no separator prefix to strip afterwards).
terms <- vapply(
  seq_len(ntopics),
  function(k) {
    top_k <- res_top_terms$term[which(res_top_terms$topic == k)]
    paste(top_k, collapse = ", ")
  },
  character(1)
)

# Summary table: topic number and its top terms.
tab.terms <- data.frame(topic = seq_len(ntopics), terms)

# Document-topic probabilities.
# Each `gamma` value is the estimated proportion of words of a document that
# were generated from a topic; e.g. the row for document 1 and topic 1 gives
# the estimated share of the words of document 1 that come from topic 1.
res_documents <- tidy(result, matrix = "gamma")
res_documents

# Classify each document into its most likely topic: keep, per document, the
# row(s) with the highest gamma, then order the documents numerically.
gamma_by_document <- group_by(res_documents, document)
most_likely_topic <- top_n(gamma_by_document, 1, gamma)
topic_doc <- arrange(most_likely_topic, as.numeric(document))

# Show the topic summary and the document classification.
tab.terms
topic_doc

# Save the full word-topic and document-topic tables as gzipped CSV files.
write.csv(res_topics, file = gzfile("topics37.csv.gz"))
write.csv(res_documents, file = gzfile("documents37.csv.gz"))